載入必要套件

if (!require("data.table")) install.packages("data.table")
if (!require("dplyr")) install.packages("dplyr") 
if (!require("reshape2")) install.packages("reshape2") 
if (!require("ggplot2")) install.packages("ggplots") 
if (!require("scales")) install.packages("scales") 
if (!require("devtools")) install.packages("devtools") 
if (!require("ggfortify")) install_github('sinhrks/ggfortify')


library(data.table) # data ETL 套件 
library(dplyr)      # data ETL 套件
library(reshape2)   # data ETL 套件
library(ggplot2)    # 視覺化套件
library(scales)     # 改變ggplot2座標軸刻度
library(devtools)
library(ggfortify)

利用fread將檔案讀入R

# 填入您存放資料的路徑 (可以是相對/絕對路徑)
raw <- fread("~/DSP/A2/hypermall_H1.csv", data.table=FALSE)
sup <- fread("~/DSP/A2/hypermall_supplement.csv", data.table=FALSE)
str(raw)
'data.frame':   391642 obs. of  9 variables:
 $ Store       : chr  "H1" "H1" "H1" "H1" ...
 $ Dept        : chr  "IDz-9MIl" "IDz-9MIl" "IDz-9MIl" "IDz-9MIl" ...
 $ Date        : chr  "01/01/2009" "01/01/2009" "01/01/2009" "01/01/2009" ...
 $ Family      : chr  "滷.煮一" "烤雞肉一" "肉製品一" "素食商品" ...
 $ Sales_Value : chr  "23376" "16747" "10076" "9765" ...
 $ Sales_Qty   : chr  "516" "368" "59" "179" ...
 $ Sales_Number: chr  "195" "149" "53" "140" ...
 $ Item_Type   : chr  "49" "17" "28" "19" ...
 $ Price_Mode  : chr  "11" "30" "188" "45" ...
# 將字串型的變數轉成適當的資料型態
raw <- mutate(raw, 
              Date=as.Date(Date, format="%m/%d/%Y"),
              Sales_Value=as.numeric(Sales_Value), 
              Sales_Qty=as.numeric(Sales_Qty),
              Sales_Number=as.numeric(Sales_Number), 
              Item_Type=as.numeric(Item_Type),
              Price_Mode=as.numeric(Price_Mode)
              )

探索性資料分析 (Exploratory Data Analysis)

# ggplot 起手式
ggplot(data, aes(x=a, y=b, ...)) + geom_xxx()
# 客製化 ggplot2 佈景主題
thm <- function() {
  theme_gray(base_family = "STHeiti") + # 讓Mac使用者能夠顯示中文, Windows使用者應省略這行
  theme(text=element_text(size=18)) # 將字體調整至18號
}

df1 <- group_by(raw, Date) %>% 
  summarise(Sales_Value=sum(Sales_Value), Sales_Number=sum(Sales_Number))

ggplot(df1, aes(x=Date, y=Sales_Number)) + geom_point()

ggplot(df1, aes(x=Date, y=Sales_Number)) + geom_point(size=4) + 
  labs(x="日期", y="銷售次數") + thm() + 
  scale_y_continuous(labels=comma) +
  scale_x_date(labels=date_format("%m"), breaks = date_breaks("1 month"))

ggplot(df1, aes(x=Date, y=Sales_Value)) + geom_point(size=4) + 
  labs(x="日期", y="銷售金額") +  thm() + 
  scale_y_continuous(labels=comma) + 
  scale_x_date(labels=date_format("%m"), breaks = date_breaks("1 month"))

ggplot(df1, aes(x=Sales_Number, y=Sales_Value)) + geom_point(size=4) +
  labs(x="銷售次數", y="銷售金額") + thm() + 
  scale_x_continuous(labels=comma) + 
  scale_y_continuous(labels=comma) 

加入額外的變數作圖

  • 日期 vs. 銷售次數 vs. 週間
  • 日期 vs. 銷售金額 vs. 週間
  • 銷售次數 vs. 銷售金額
df1 <- mutate(df1, is.weekday=strftime(Date, "%u")<6)

ggplot(df1, aes(x=Date, y=Sales_Number, colour=is.weekday)) + geom_point(size=4) + 
  labs(x="日期", y="銷售次數") + thm() + 
  scale_y_continuous(labels=comma) +
  scale_x_date(labels=date_format("%m"), breaks = date_breaks("1 month"))

ggplot(df1, aes(x=Date, y=Sales_Value, colour=is.weekday)) + geom_point(size=4) + 
  labs(x="日期", y="銷售金額") +  thm() + 
  scale_y_continuous(labels=comma) + 
  scale_x_date(labels=date_format("%m"), breaks = date_breaks("1 month"))

ggplot(df1, aes(x=Sales_Number, y=Sales_Value, colour=is.weekday)) + geom_point(size=4) +
  labs(x="銷售次數", y="銷售金額") + thm() + 
  scale_x_continuous(labels=comma) + 
  scale_y_continuous(labels=comma) 

  • 日期 vs. 銷售金額 vs. 節日
  • 日期 vs. 銷售金額 vs. 颱風
  • 日期 vs. 銷售金額 vs. 雨量
str(sup)
'data.frame':   1095 obs. of  8 variables:
 $ Date      : chr  "01/01/2009" "01/02/2009" "01/03/2009" "01/04/2009" ...
 $ Lunar_Date: chr  "農曆臘月初六日" "農曆臘月初七日" "農曆臘月初八日" "農曆臘月初九日" ...
 $ Wyear     : chr  "W1" "W1" "W1" "W1" ...
 $ Wday      : chr  "星期四" "星期五" "星期六" "星期日" ...
 $ Store     : chr  "H1" "H1" "H1" "H1" ...
 $ is.bigday : chr  "0" "0" "0" "0" ...
 $ Rainfall  : chr  "0.2" "0" "0" "8.5" ...
 $ is.typhoon: chr  "0" "0" "0" "0" ...
# 將字串型的變數轉成適當的資料型態
sup <- mutate(sup,
              Date=as.Date(Date, format="%m/%d/%Y"),
              is.bigday=as.logical(as.integer(is.bigday)),
              is.typhoon=as.logical(as.integer(is.typhoon)),
              is.weekday=strftime(Date, format="%u")<6,
              Rainfall=as.numeric(Rainfall)
              )
df2 <- left_join(raw, sup, by=c("Date", "Store")) %>% 
  group_by(Date, is.weekday, is.bigday, is.typhoon, Rainfall) %>% 
  summarise(Sales_Value=sum(Sales_Value), Sales_Number=sum(Sales_Number))

ggplot(df2, aes(x=Date, y=Sales_Value, colour=is.bigday)) + geom_point(size=4) + 
  labs(x="日期", y="銷售金額") + thm() + 
  scale_y_continuous(labels=comma) +
  scale_x_date(labels=date_format("%m"), breaks = date_breaks("1 month"))

ggplot(df2, aes(x=Date, y=Sales_Value, colour=is.typhoon)) + geom_point(size=4) + 
  labs(x="日期", y="銷售金額") +  thm() + 
  scale_y_continuous(labels=comma) + 
  scale_x_date(labels=date_format("%m"), breaks = date_breaks("1 month"))

ggplot(df2, aes(x=Date, y=Sales_Value, colour=Rainfall)) + 
  geom_point(size=4) + 
  labs(x="日期", y="銷售金額") +  thm() + 
  scale_y_continuous(labels=comma) + 
  scale_x_date(labels=date_format("%m"), breaks = date_breaks("1 month")) + 
  scale_colour_gradient2(low="#99FF00", mid="#81DB5A", high="#287AA9", midpoint = 20)

df2 <- mutate(df2, Rain_Lv=cut(Rainfall,
                               breaks=c(0, 3, 50 ,130, 200, Inf), 
                               labels=c("無雨","小雨","大雨","豪雨","大豪雨"),
                               right=FALSE))

ggplot(df2, aes(x=Date, y=Sales_Value, colour=Rain_Lv)) + 
  geom_point(size=4) + 
  labs(x="日期", y="銷售金額") +  thm() + 
  scale_y_continuous(labels=comma) + 
  scale_x_date(labels=date_format("%m"), breaks = date_breaks("1 month")) + 
  scale_colour_brewer(palette = "Set2")

ggplot(df2, aes(x=Sales_Number, y=Sales_Value)) + 
  geom_point(aes(colour=is.weekday, shape=is.bigday), size=4) +
  labs(x="銷售次數", y="銷售金額") + thm() + 
  scale_x_continuous(labels=comma) + 
  scale_y_continuous(labels=comma) 

df2 <- mutate(df2, group=factor(paste(is.weekday, is.bigday, sep="-"), 
                           levels=c("FALSE-FALSE", "FALSE-TRUE", 
                                    "TRUE-FALSE", "TRUE-TRUE"),
                           labels=c("週間,非節日", "週間,節日", 
                                    "非週間,非節日", "非週間,節日")))

ggplot(df2, aes(x=Sales_Number, y=Sales_Value)) + 
  geom_point(aes(colour=group), size=4) +
  labs(x="銷售次數", y="銷售金額") + thm() + 
  scale_x_continuous(labels=comma) + 
  scale_y_continuous(labels=comma) 

建立線性模型

fit1 <- lm(Sales_Value ~ Sales_Number, data=df2)
summary(fit1)

Call:
lm(formula = Sales_Value ~ Sales_Number, data = df2)

Residuals:
     Min       1Q   Median       3Q      Max 
-1782242  -138676   -33780   109120  1884361 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)  -1.066e+05  4.469e+04  -2.386   0.0175 *  
Sales_Number  1.401e+02  1.522e+00  92.021   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 359200 on 363 degrees of freedom
Multiple R-squared:  0.9589,    Adjusted R-squared:  0.9588 
F-statistic:  8468 on 1 and 363 DF,  p-value: < 2.2e-16
fit2 <- lm(Sales_Value ~ Sales_Number + is.weekday + is.bigday
           + is.weekday:is.bigday, data=df2)
summary(fit2)

Call:
lm(formula = Sales_Value ~ Sales_Number + is.weekday + is.bigday + 
    is.weekday:is.bigday, data = df2)

Residuals:
     Min       1Q   Median       3Q      Max 
-1856027  -127133   -10140   110408  1535202 

Coefficients:
                               Estimate Std. Error t value Pr(>|t|)    
(Intercept)                  -3.510e+05  8.611e+04  -4.076 5.64e-05 ***
Sales_Number                  1.414e+02  2.212e+00  63.925  < 2e-16 ***
is.weekdayTRUE                2.437e+05  5.320e+04   4.580 6.41e-06 ***
is.bigdayTRUE                 3.981e+05  1.165e+05   3.417 0.000705 ***
is.weekdayTRUE:is.bigdayTRUE -1.344e+05  1.234e+05  -1.089 0.276952    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 331800 on 360 degrees of freedom
Multiple R-squared:  0.9652,    Adjusted R-squared:  0.9648 
F-statistic:  2498 on 4 and 360 DF,  p-value: < 2.2e-16
out1 <- data.frame(x=df2$Sales_Number, y=predict(fit1))
out2 <- data.frame(x=df2$Sales_Number, y=predict(fit2))

ggplot(df2, aes(x=Sales_Number, y=Sales_Value)) + geom_point(size=4) +
  geom_line(aes(x=x,y=y), data=out1, col=2, size=1) +
  geom_point(aes(x=x,y=y), data=out2, col=4, size=2) +
  labs(x="銷售次數", y="銷售金額") + thm() + 
  scale_x_continuous(labels=comma) + 
  scale_y_continuous(labels=comma)

模型診斷

# Total sum of squared error
sum((df2$Sales_Value - out1$y)^2) 
[1] 4.684006e+13
# R-squared
1 - sum((df2$Sales_Value - out1$y)^2) / sum((df2$Sales_Value - mean(df2$Sales_Value))^2)
[1] 0.9588946
sum((df2$Sales_Value - out2$y)^2)
[1] 3.963419e+13
1 - sum((df2$Sales_Value - out2$y)^2) / sum((df2$Sales_Value - mean(df2$Sales_Value))^2)
[1] 0.9652182
anova(fit1, fit2)
Analysis of Variance Table

Model 1: Sales_Value ~ Sales_Number
Model 2: Sales_Value ~ Sales_Number + is.weekday + is.bigday + is.weekday:is.bigday
  Res.Df        RSS Df  Sum of Sq      F    Pr(>F)    
1    363 4.6840e+13                                   
2    360 3.9634e+13  3 7.2059e+12 21.817 5.278e-13 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(fit1)

Call:
lm(formula = Sales_Value ~ Sales_Number, data = df2)

Residuals:
     Min       1Q   Median       3Q      Max 
-1782242  -138676   -33780   109120  1884361 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)  -1.066e+05  4.469e+04  -2.386   0.0175 *  
Sales_Number  1.401e+02  1.522e+00  92.021   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 359200 on 363 degrees of freedom
Multiple R-squared:  0.9589,    Adjusted R-squared:  0.9588 
F-statistic:  8468 on 1 and 363 DF,  p-value: < 2.2e-16
summary(fit2)

Call:
lm(formula = Sales_Value ~ Sales_Number + is.weekday + is.bigday + 
    is.weekday:is.bigday, data = df2)

Residuals:
     Min       1Q   Median       3Q      Max 
-1856027  -127133   -10140   110408  1535202 

Coefficients:
                               Estimate Std. Error t value Pr(>|t|)    
(Intercept)                  -3.510e+05  8.611e+04  -4.076 5.64e-05 ***
Sales_Number                  1.414e+02  2.212e+00  63.925  < 2e-16 ***
is.weekdayTRUE                2.437e+05  5.320e+04   4.580 6.41e-06 ***
is.bigdayTRUE                 3.981e+05  1.165e+05   3.417 0.000705 ***
is.weekdayTRUE:is.bigdayTRUE -1.344e+05  1.234e+05  -1.089 0.276952    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 331800 on 360 degrees of freedom
Multiple R-squared:  0.9652,    Adjusted R-squared:  0.9648 
F-statistic:  2498 on 4 and 360 DF,  p-value: < 2.2e-16
autoplot(fit1)

autoplot(fit2)